In [1]:
import pandas as pd
#!pip install tqdm
from tqdm import tqdm #progress bar
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import STL
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy.stats import pearsonr
from scipy import stats
import requests
import json
import time
import matplotlib.dates as mdates
In [2]:
excel_file_path = r"E:\Columbia University\APAN Term3\Capstone\RC Projection data since 2022 28AUG24.xlsx"

excel_file = pd.ExcelFile(excel_file_path)

sheets_to_load = ['2022', '2023', '2024', 'Workday Calendar', 'Location hierarchy', 'Fleet Count']

dfs = {}

for sheet in tqdm(sheets_to_load, desc="Loading sheets"):
    dfs[sheet] = pd.read_excel(excel_file, sheet_name=sheet)

df_2022 = dfs['2022']
df_2023 = dfs['2023']
df_2024 = dfs['2024']
workday_calendar = dfs['Workday Calendar']
location_df = dfs['Location hierarchy']
fleet_count = dfs['Fleet Count']
Loading sheets: 100%|████████████████████████████████████████████████████████████████████| 6/6 [02:15<00:00, 22.53s/it]
In [3]:
df_total = pd.concat([df_2022, df_2023, df_2024], axis=0, ignore_index=True)
df_cleaned = df_total.loc[:, ~df_total.columns.str.contains('^Unnamed')]
In [4]:
workday_months = workday_calendar[['MONTH', 'WD_IN_MNTH', 'DAYS_IN_MNTH', 'SATURDAY_COUNT', 'WEEKDAY_HOLIDAYS']]
workday_months = workday_months.dropna(how='all')

workday_days = workday_calendar[['MONTH.1', 'DT', 'DT_KEY', 'WD_IN_MNTH.1', 'DAYS_IN_MNTH.1', 'SATURDAY_COUNT.1', 'WEEKDAY_HOLIDAYS.1',
                                'Saturdays in Month', 'Workdays in Month', 'Max Workday', 'Max Saturday',
                                'WD Remaining', 'Saturday Remaining']]

Month¶

In [5]:
# Group the data by MONTH and calculate the sum of TRANSACTION_AMOUNT for each month
monthly_data = df_cleaned.groupby('MONTH')['TRANSACTION_AMOUNT'].sum().reset_index()
In [6]:
# Plotting the monthly transaction amounts
plt.figure(figsize=(12, 6))
plt.plot(monthly_data['MONTH'], monthly_data['TRANSACTION_AMOUNT'], marker='o', linestyle='-')
plt.title('Monthly Transaction Amounts (Historical Data)')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()
plt.tight_layout()

# Display the plot
plt.show()
In [7]:
# Group the data by YEAR and MONTH to calculate the sum of TRANSACTION_AMOUNT for each month
monthly_data_by_year = df_cleaned.groupby(['YEAR', 'MONTH'])['TRANSACTION_AMOUNT'].sum().reset_index()

# Create a plot for each year
plt.figure(figsize=(15, 10))

# Plot for 2022
plt.subplot(3, 1, 1)  # 3 rows, 1 column, first subplot
data_2022 = monthly_data_by_year[monthly_data_by_year['YEAR'] == '2022-01-01']
plt.plot(data_2022['MONTH'], data_2022['TRANSACTION_AMOUNT'], marker='o', linestyle='-', color='b')
plt.title('Monthly Transaction Amounts for 2022')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()

# Plot for 2023
plt.subplot(3, 1, 2)  # 3 rows, 1 column, second subplot
data_2023 = monthly_data_by_year[monthly_data_by_year['YEAR'] == '2023-01-01']
plt.plot(data_2023['MONTH'], data_2023['TRANSACTION_AMOUNT'], marker='o', linestyle='-', color='g')
plt.title('Monthly Transaction Amounts for 2023')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()

# Plot for 2024
plt.subplot(3, 1, 3)  # 3 rows, 1 column, third subplot
data_2024 = monthly_data_by_year[monthly_data_by_year['YEAR'] == '2024-01-01']
plt.plot(data_2024['MONTH'], data_2024['TRANSACTION_AMOUNT'], marker='o', linestyle='-', color='r')
plt.title('Monthly Transaction Amounts for 2024')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.xticks(rotation=45)
plt.grid()

# Adjust layout for better spacing
plt.tight_layout()

# Display the plot
plt.show()

Day¶

2022¶

In [8]:
# Daily Historical Cost for each Month
plt.figure(figsize=(20, 15))

for month in range(1, 13):  # 1 to 12 for 2022
    month_data_22 = df_2022[df_2022['MONTH'].dt.month == month]
    daily_cost_22 = month_data_22.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
    
    plt.subplot(3, 4, month)  # 3 rows, 4 columns, month plot
    plt.plot(daily_cost_22['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_22['TRANSACTION_AMOUNT'], linestyle='-', color='b', linewidth=2)  # Adjusted line width
    plt.title(f'Daily Costs for 2022-{month:02d}')
    plt.xlabel('Date')
    plt.ylabel('Total Daily Cost')
    
    # Format x-ticks to show only the date (day and month)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
    plt.grid()

plt.tight_layout()
plt.show()
In [9]:
# Create a plot for each month of 2022
plt.figure(figsize=(20, 15))

for month in range(1, 13):  # 1 to 12 for 2022
    month_data_22 = df_2022[df_2022['MONTH'].dt.month == month]
    daily_cost_22 = month_data_22.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
    daily_cost_22['CUMULATIVE_TRANSACTION_COST'] = daily_cost_22['TRANSACTION_AMOUNT'].cumsum()
    
    plt.subplot(3, 4, month)  # 3 rows, 4 columns, month plot
    plt.plot(daily_cost_22['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_22['CUMULATIVE_TRANSACTION_COST'], 
             linestyle='-', color='b', linewidth=2)

    plt.title(f'Cumulative Costs for 2022-{month:02d}')
    plt.xlabel('Date') 
    plt.ylabel('Cumulative Total Cost') 
    
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))  # Show only day and month
    plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
    plt.grid()

plt.tight_layout()
plt.show()
In [10]:
# Create a plot for each month of 2022
plt.figure(figsize=(20, 15))

for month in range(1, 13):  # 1 to 12 for 2022
    month_data_22 = df_2022[df_2022['MONTH'].dt.month == month]
    
    # Count occurrences of FIN_SOURCE_TYPE_DESC
    fin_source_counts = month_data_22['FIN_SOURCE_TYPE_DESC'].value_counts().head(5)  # Get top 5 types
    
    plt.subplot(3, 4, month)  # 3 rows, 4 columns, month plot
    fin_source_counts.plot(kind='bar', color='gray')  # Bar plot for counts
    plt.title(f'Top 5 FIN_SOURCE_TYPE_DESC for 2022-{month:02d}')
    plt.xlabel('Financial Source Type')
    plt.ylabel('Count')
    #plt.xticks(rotation=15)  # Rotate x-ticks for better visibility
    plt.grid(axis='y')

plt.tight_layout()
plt.show()

2023¶

In [11]:
# Daily Historical Cost for each Month in 2023
plt.figure(figsize=(20, 15))

for month in range(1, 13):  # 1 to 12 for 2023
    month_data_23 = df_2023[df_2023['MONTH'].dt.month == month]
    daily_cost_23 = month_data_23.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
    
    plt.subplot(3, 4, month)  # 3 rows, 4 columns, month plot
    plt.plot(daily_cost_23['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_23['TRANSACTION_AMOUNT'], linestyle='-', color='orange', linewidth=1)  # Adjusted line width
    plt.title(f'Daily Costs for 2023-{month:02d}')
    plt.xlabel('Date')
    plt.ylabel('Total Daily Cost')
    
    # Format x-ticks to show only the date (day and month)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
    plt.grid()

plt.tight_layout()
plt.show()
In [12]:
# Create a plot for each month of 2023
plt.figure(figsize=(20, 15))

for month in range(1, 13):  # 1 to 12 for 2023
    month_data_23 = df_2023[df_2023['MONTH'].dt.month == month]
    daily_cost_23 = month_data_23.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
    daily_cost_23['CUMULATIVE_TRANSACTION_COST'] = daily_cost_23['TRANSACTION_AMOUNT'].cumsum()
    
    plt.subplot(3, 4, month)  # 3 rows, 4 columns, month plot
    plt.plot(daily_cost_23['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_23['CUMULATIVE_TRANSACTION_COST'], 
             linestyle='-', color='orange', linewidth=1)

    plt.title(f'Cumulative Costs for 2023-{month:02d}')
    plt.xlabel('Date') 
    plt.ylabel('Cumulative Total Cost') 
    
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))  # Show only day and month
    plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
    plt.grid()

plt.tight_layout()
plt.show()

补一张Top 5 FIN_SOURCE_TYPE_DESC for 2023 (和上面2022 一样)

2024¶

In [13]:
# Daily Historical Cost for each Month in 2024
plt.figure(figsize=(20, 15))

for month in range(1, 9):  # 1 to 12 for 2024
    month_data_24 = df_2024[df_2024['MONTH'].dt.month == month]
    daily_cost_24 = month_data_24.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
    
    plt.subplot(2, 4, month)  # 3 rows, 4 columns, month plot
    plt.plot(daily_cost_24['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_24['TRANSACTION_AMOUNT'], linestyle='-', color='pink', linewidth=2)  # Adjusted line width
    plt.title(f'Daily Costs for 2024-{month:02d}')
    plt.xlabel('Date')
    plt.ylabel('Total Daily Cost')
    
    # Format x-ticks to show only the date (day and month)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
    plt.grid()

plt.tight_layout()
plt.show()
In [14]:
# Daily Historical Cost for each Month in 2024
plt.figure(figsize=(20, 15))

for month in range(1, 9):  # 1 to 12 for 2024
    month_data_24 = df_2024[df_2024['MONTH'].dt.month == month]
    daily_cost_24 = month_data_24.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
    daily_cost_24['CUMULATIVE_TRANSACTION_COST'] = daily_cost_24['TRANSACTION_AMOUNT'].cumsum()
    
    plt.subplot(2, 4, month)  # 3 rows, 4 columns, month plot
    plt.plot(daily_cost_24['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_24['CUMULATIVE_TRANSACTION_COST'], linestyle='-', color='pink', linewidth=2)  # Adjusted line width
    plt.title(f'Cumulative Costs for 2024-{month:02d}')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Daily Cost')
    
    # Format x-ticks to show only the date (day and month)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
    plt.grid()

plt.tight_layout()
plt.show()

补一张Top 5 FIN_SOURCE_TYPE_DESC for 2024 (和上面2022 一样)

以上都是national base的,下面需要一样的图for region和location,有问题可以看他的recording 13分钟的那个

In [17]:
df_cleaned.head()
Out[17]:
MONTH YEAR GL_TRANSACTION_EFFECTIVE_DT GL_EFF_WD_NBR FIN_SOURCE_TYPE_DESC LOCATION_CD LOCATION_DESC BU_DESC REGION_DESC TRANSACTION_AMOUNT
0 2022-01-01 2022-01-01 2022-01-02 0 INTERDISTRICT BILLING 514 TUPELO OPS - MID SOUTH BU OPS-SOUTH REGION 52.0
1 2022-01-01 2022-01-01 2022-01-02 0 INTERDISTRICT BILLING 1554 MANSFIELD OPS - GREAT LAKES BU OPS-NORTH REGION 52.0
2 2022-01-01 2022-01-01 2022-01-02 0 INTERDISTRICT BILLING 30 LANDOVER MD OPS - DMV BU OPS-NORTH REGION 52.0
3 2022-01-01 2022-01-01 2022-01-02 0 INTERDISTRICT BILLING 284 BRIDGETON MISSOURI OPS - GREAT PLAINS BU OPS-CENTRAL REGION 208.0
4 2022-01-01 2022-01-01 2022-01-02 0 INTERDISTRICT BILLING 1278 SANFORD NC OPS - CAROLINAS BU OPS-SOUTH REGION 52.0
In [35]:
unique_regions = df_2024['REGION_DESC'].unique()
print(unique_regions)
['OPS-CENTRAL REGION ' 'OPS-SOUTH REGION ' 'OPS-NORTH REGION '
 'OPS-WEST REGION ']

Cumulative Costs for 2022 by region¶

In [38]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


plt.figure(figsize=(20, 15))

regions = ['OPS-CENTRAL REGION ', 'OPS-SOUTH REGION ', 'OPS-NORTH REGION ', 'OPS-WEST REGION ']

for month in range(1, 13):  
    plt.subplot(3, 4, month) 
    for region in regions:
        month_data_22 = df_2022[(df_2022['MONTH'].dt.month == month) & (df_2022['REGION_DESC'] == region)]
        daily_cost_22 = month_data_22.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
        daily_cost_22['CUMULATIVE_TRANSACTION_COST'] = daily_cost_22['TRANSACTION_AMOUNT'].cumsum()
        
        plt.plot(daily_cost_22['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_22['CUMULATIVE_TRANSACTION_COST'], 
                 linestyle='-', label=region)

    plt.title(f'Cumulative Costs for 2022-{month:02d}')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Daily Cost')
    
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.xticks(rotation=45) 
    plt.grid()
    plt.legend()  

plt.tight_layout()
plt.show()

Cumulative Costs for 2023 by region¶

In [37]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

plt.figure(figsize=(20, 15))

regions = ['OPS-CENTRAL REGION ', 'OPS-SOUTH REGION ', 'OPS-NORTH REGION ', 'OPS-WEST REGION ']

for month in range(1, 13):  
    plt.subplot(3, 4, month)  
    for region in regions:
        month_data_23 = df_2023[(df_2023['MONTH'].dt.month == month) & (df_2023['REGION_DESC'] == region)]
        daily_cost_23 = month_data_23.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
        daily_cost_23['CUMULATIVE_TRANSACTION_COST'] = daily_cost_23['TRANSACTION_AMOUNT'].cumsum()
        
        plt.plot(daily_cost_23['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_23['CUMULATIVE_TRANSACTION_COST'], 
                 linestyle='-', label=region)

    plt.title(f'Cumulative Costs for 2023-{month:02d}')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Daily Cost')
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.xticks(rotation=45) 
    plt.grid()
    plt.legend()  

plt.tight_layout()
plt.show()

Cumulative Costs for 2024 by region¶

In [39]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Create a figure for 8 months
plt.figure(figsize=(20, 10))

regions = ['OPS-CENTRAL REGION ', 'OPS-SOUTH REGION ', 'OPS-NORTH REGION ', 'OPS-WEST REGION ']

for month in range(1, 9):  #Only 1 to 8 month for 2024
    plt.subplot(2, 4, month)  
    for region in regions:
        month_data_24 = df_2024[(df_2024['MONTH'].dt.month == month) & (df_2024['REGION_DESC'] == region)]
        daily_cost_24 = month_data_24.groupby('GL_TRANSACTION_EFFECTIVE_DT')['TRANSACTION_AMOUNT'].sum().reset_index()
        daily_cost_24['CUMULATIVE_TRANSACTION_COST'] = daily_cost_24['TRANSACTION_AMOUNT'].cumsum()
        
        plt.plot(daily_cost_24['GL_TRANSACTION_EFFECTIVE_DT'], daily_cost_24['CUMULATIVE_TRANSACTION_COST'], 
                 linestyle='-', label=region)

    plt.title(f'Cumulative Costs for 2024-{month:02d}')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Daily Cost')
    
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.xticks(rotation=45)  
    plt.grid()
    plt.legend() 

plt.tight_layout()
plt.show()
In [ ]: